Synthetic ACS Evaluation

library(tidyverse)
library(furrr)
library(here)
library(tidysynthesis)
library(syntheval)
library(tictoc)
library(patchwork)
library(urbnthemes)

set_urbn_defaults(style = "print")

source(here("R", "prediction_test.R"))
source(here("R", "kmarginals.R"))
source(here("R", "proportions.R"))
source(here("R", "discriminator_auc.R"))
source(here("R", "regression_ci_overlap.R"))
source(here("R", "membership_inference_test.R"))

source(here("R", "summarize_results.R"))

Load ACS

gsds <- readRDS(here("data", "starting", "acs-gsds.rds")) |>
  select(-serial)

holdout <- readRDS(here("data", "starting", "acs-holdout.rds")) |>
  select(-serial)

set.seed(20230822)
holdout_frac <- holdout  |>
  slice_sample(prop = 0.1)

syntheses <- list.files(here::here("data", "results", "acs"), full.names = TRUE)

syntheses <- syntheses[str_detect(syntheses, pattern = "synth-acs_")]

syntheses <- syntheses |>
  map(readRDS)

# syntheses <- syntheses[1:2]

# plan(multisession, workers = 4)
# message("Number of parallel workers: ", nbrOfWorkers())

if (!file.exists(here("data", "results", "acs-metrics.rds"))) {

  set.seed(20230818)
  tic()
  results <-
    map(
      .x = syntheses,
      .f = ~summarize_results(postsynth = .x, data = gsds, holdout = holdout_frac),
      .progress = TRUE
    )
  toc()
  
  saveRDS(results, here("data", "results", "acs-metrics.rds"))
  
} else {
  
  results <- readRDS(here("data", "results", "acs-metrics.rds"))
  
}



specs <- read_csv(here::here("data", "results", "specs.csv")) |>
  mutate(
    opt_in = factor(opt_in),
    white_multiplier = factor(white_multiplier)
  )

Univariate Utility

Proportions

bind_cols(
  specs |> 
    slice(rep(1:n(), each = 32)),
  results |>
    map("proportions") |>
    map_dfr(.f = ~.x |> 
              pivot_wider(id_cols = c(variable, class), names_from = source, values_from = prop)
    ) |>
    mutate(abs_error = abs(original - synthetic))
) |>
  filter(white_multiplier == "1") |>
  ggplot(aes(opt_in, abs_error)) +
  geom_point(alpha = 0.4) +
  facet_wrap(~variable) +
  labs(
    x = "Proportion Opt In",
    y = "Absolute Error In Proportion"
  ) +
  scatter_grid()

bind_cols(
  specs |> 
    slice(rep(1:n(), each = 97)),
  results |>
    map("proportions_race_eth") |>
    map_dfr(.f = ~.x |> 
              pivot_wider(id_cols = c(variable, class, race_eth), names_from = source, values_from = prop)
    ) |>
    mutate(error = original - synthetic)
) |>
  filter(white_multiplier == 1) |>
  filter(!variable %in% c("hispan", "race")) |>
  ggplot(aes(opt_in, error, color = race_eth)) +
  geom_point(alpha = 0.4) +
  facet_wrap(~variable) +
  labs(
    x = "Proportion Opt In",
    y = "Error In Proportion"
  ) +
  scatter_grid()

bind_cols(
  specs |> 
    slice(rep(1:n(), each = 97)),
  results |>
    map("proportions_race_eth") |>
    map_dfr(.f = ~.x |> 
              pivot_wider(id_cols = c(variable, class, race_eth), names_from = source, values_from = prop)
    ) |>
    mutate(abs_error = abs(original - synthetic))
) |>
  filter(white_multiplier == 1) |>
  filter(!variable %in% c("hispan", "race")) |>
  ggplot(aes(opt_in, abs_error, color = race_eth)) +
  geom_point(alpha = 0.4) +
  facet_wrap(~variable) +
  labs(
    x = "Proportion Opt In",
    y = "Absolute Error In Proportion"
  ) +
  scatter_grid()

bind_cols(
  specs |> 
    slice(rep(1:n(), each = 97)),
  results |>
    map("proportions_race_eth") |>
    map_dfr(.f = ~.x |> 
              pivot_wider(id_cols = c(variable, class, race_eth), names_from = source, values_from = prop)
    ) |>
    mutate(error = original - synthetic)
) |>
  filter(id == "011", race_eth == "Black", variable == "marst")

Proportion with Wages

bind_cols(
  specs,
  results |>
    map("moments") |>
    map_dfr(.f = ~.x |> filter(variable == "incwage", statistic == "have"))
) |>
  ggplot(aes(opt_in, proportion_difference, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  labs(
    y = "Error in Proportion with Salaries and Wages"
  ) +
  scatter_grid()

bind_cols(
  specs |> 
    slice(rep(1:n(), each = 9)),
  results |>
    map("moments_race") |>
    map_dfr(.f = ~.x |> filter(variable == "incwage", statistic == "have"))
) |>
  ggplot(aes(opt_in, proportion_difference, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0) +
  facet_wrap(~race) +
  labs(
    y = "Error in Proportion with Salaries and Wages",
    color = "White Multiplier"
  ) +
  scatter_grid()

bind_cols(
  specs |> 
    slice(rep(1:n(), each = 4)),
  results |>
    map("moments_race_eth") |>
    map_dfr(.f = ~.x |> filter(variable == "incwage", statistic == "have"))
) |>
  ggplot(aes(opt_in, proportion_difference, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0) +
  facet_wrap(~ race_eth) +
  labs(
    y = "Error in Proportion with Salaries and Wages",
    color = "White Multiplier"
  ) +
  scatter_grid()

Mean Income

bind_cols(
  specs,
  results |>
    map("moments") |>
    map_dfr(.f = ~.x |> filter(variable == "incwage", statistic == "mean"))
) |>
  ggplot(aes(opt_in, proportion_difference, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  labs(
    x = "Proportion Opt In",
    y = "Proportion Error in Mean Wage and Salary Income"
  ) +
  scatter_grid()

bind_cols(
  specs |> 
    slice(rep(1:n(), each = 9)),
  results |>
    map("moments_race") |>
    map_dfr(.f = ~.x |> filter(variable == "incwage", statistic == "mean"))
) |>
  ggplot(aes(opt_in, proportion_difference, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0) +
  facet_wrap(~race) +
  labs(
    x = "Proportion Opt In",
    y = "Proportion Error in Mean Wage and Salary Income",
    color = "White Multiplier"
  ) +
  scatter_grid()

Increasing opt in for small groups can increase their utility.

Percentiles

bind_cols(
  specs |>
    slice(rep(1:n(), each = 11)),
  results |>
    map("percentiles") |>
    map_dfr(.f = ~ .x |> filter(variable == "incwage"))
) |>
  mutate(proportion_difference = if_else(is.nan(proportion_difference), 0, proportion_difference)) |>
  ggplot(aes(p, proportion_difference, group = id)) +
  geom_line(alpha = 0.5) +
  facet_wrap(~opt_in) +
  labs(
    x = "Percentile",
    y = "Proportion Error"
  ) +
  scatter_grid()

bind_cols(
  specs |>
    slice(rep(1:n(), each = 11)),
  results |>
    map("percentiles") |>
    map_dfr(.f = ~ .x |> filter(variable == "ftotinc"))
) |>
  mutate(proportion_difference = if_else(is.nan(proportion_difference), 0, proportion_difference)) |>
  ggplot(aes(p, proportion_difference, group = id)) +
  geom_line(alpha = 0.5) +
  facet_wrap(~opt_in) +
  labs(
    x = "Percentile",
    y = "Proportion Error"
  ) +
  scatter_grid()

bind_cols(
  specs |>
    slice(rep(1:n(), each = 11)),
  results |>
    map("percentiles") |>
    map_dfr(.f = ~ .x |> filter(variable == "inctot"))
) |>
  mutate(proportion_difference = if_else(is.nan(proportion_difference), 0, proportion_difference)) |>
  ggplot(aes(p, proportion_difference, group = id)) +
  geom_line(alpha = 0.5) +
  facet_wrap(~opt_in) +
  labs(
    x = "Percentile",
    y = "Proportion Error"
  ) +
  scatter_grid()

bind_cols(
  specs |>
    slice(rep(1:n(), each = 11)),
  results |>
    map("percentiles") |>
    map_dfr(.f = ~ .x |> filter(variable == "incresid"))
) |>
  mutate(proportion_difference = if_else(is.nan(proportion_difference), 0, proportion_difference)) |>
  ggplot(aes(p, proportion_difference, group = id)) +
  geom_line(alpha = 0.5) +
  facet_wrap(~opt_in) +
  labs(
    x = "Percentile",
    y = "Proportion Error"
  ) +
  scatter_grid()

Bivariate Utility

bind_cols(
  specs,
  correlation_difference_mae = results |>
    map_dbl(.f = ~ .x$cor_fit$correlation_difference_mae)
) |>
  ggplot(aes(opt_in, correlation_difference_mae, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  scale_y_continuous(limits = c(0, NA)) +
  labs(
    x = "Proportion Opt In",
    y = "Mean Absolute Error in Pairwise Correlation Coefficient"
  ) +
  scatter_grid()

bind_cols(
  specs,
  correlation_difference_mae = results |>
    map_dbl(.f = ~ .x$cor_fit$correlation_difference_mae)
) |>
  group_by(opt_in, white_multiplier) |>
  summarize(correlation_difference_mae = median(correlation_difference_mae)) |>
  ungroup() |>
  ggplot(aes(opt_in, correlation_difference_mae, color = white_multiplier, group = white_multiplier)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Proportion Opt In",
    y = "Median Mean Absolute Error in Pairwise Correlation Coefficient"
  ) +
  scatter_grid()

Multivariate Utility

k1_benchmark <- kmarginals(
  postsynth = holdout, 
  data = gsds, 
  k = 1
)

bind_cols(
  specs,
  k1_marginals = results |>
    map_dbl("k1_marginals")
) |>
  ggplot(aes(opt_in, k1_marginals, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = k1_benchmark, color = "red") +
  scale_y_continuous(limits = c(NA, 1000)) +
  labs(
    subtitle = "Score from Holdout Data in Red",
    x = "Opt-In Rate",
    y = "1-Marginal Score"
  ) +
  scatter_grid()

k2_benchmark <- kmarginals(
  postsynth = holdout, 
  data = gsds, 
  k = 2
)

bind_cols(
  specs,
  k2_marginals = results |>
    map_dbl("k2_marginals")
) |>
  ggplot(aes(opt_in, k2_marginals, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = k2_benchmark, color = "red") +
  scale_y_continuous(limits = c(NA, 1000)) +  
  labs(
    subtitle = "Score from Holdout Data in Red",
    x = "Opt-In Rate",
    y = "2-Marginal Score"
  ) +  
  scatter_grid()

k3_benchmark <- kmarginals(
  postsynth = holdout, 
  data = gsds, 
  k = 3
)

bind_cols(
  specs,
  k3_marginals = results |>
    map_dbl("k3_marginals")
) |>
  ggplot(aes(opt_in, k3_marginals, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = k3_benchmark, color = "red") +  
  scale_y_continuous(limits = c(NA, 1000)) +  
  labs(
    subtitle = "Score from Holdout Data in Red",
    x = "Opt-In Rate",
    y = "3-Marginal Score"
  ) +    
  scatter_grid()

k1 <- bind_cols(
  specs,
  k1_marginals = results |>
    map_dbl("k1_marginals")
) |>
  ggplot(aes(opt_in, k1_marginals, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = k1_benchmark, color = "red") +
  scale_y_continuous(limits = c(NA, 1000)) +
  labs(
    x = "Opt-In Rate",
    y = "1-Marginal Score"
  ) +
  scatter_grid()

k3 <- bind_cols(
  specs,
  k3_marginals = results |>
    map_dbl("k3_marginals")
) |>
  ggplot(aes(opt_in, k3_marginals, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = k3_benchmark, color = "red") +  
  scale_y_continuous(limits = c(NA, 1000)) +  
  labs(
    x = "Opt-In Rate",
    y = "3-Marginal Score"
  ) +    
  scatter_grid()

k1 + k3

As k increases, the opt-in has a much bigger relative impact than the white multiplier.

bind_cols(
  specs,
  discriminator_auc = results |>
    map_dbl("discriminator_auc")
) |>
  ggplot(aes(opt_in, discriminator_auc, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  labs(
    y = "AUC for Discrimnator Model"
  ) +
  scatter_grid()

Specific Utility

regression_ci_overlap <- bind_cols(
  specs,
  discriminator_auc = results |>
    map_dfr(
      .f = ~ .x$regression_ci_overlap$ci_overlap |>
        summarize(
          ci_overlap = mean(overlap),
          abs_std_coef_diff = mean(abs(std_coef_diff)),
          sign_match = mean(sign_match),
          inference_match = mean(inference_match)
        )
    )
)

# CI difference
regression_ci_overlap |>
  ggplot(aes(opt_in, abs_std_coef_diff, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  labs(
    x = "Opt-In Rate"
  ) +
  scatter_grid()

# overlap
regression_ci_overlap |>
  ggplot(aes(opt_in, ci_overlap, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  labs(
    x = "Opt-In Rate",
    y = "Regression Confidence Interval Overlap"
  ) +
  scatter_grid()

# sign match
regression_ci_overlap |>
  ggplot(aes(opt_in, sign_match, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  labs(
    x = "Opt-In Rate"
  ) +
  scatter_grid()

# significance match
regression_ci_overlap |>
  ggplot(aes(opt_in, inference_match, color = white_multiplier)) +
  geom_point(alpha = 0.3) +
  labs(
    x = "Opt-In Rate"
  ) +
  scatter_grid()

Disclosure Metrics

L-Diversity

bind_cols(
  specs,
  discriminator_auc = results |>
    map_dfr("ldiveristy")
) |>
  group_by(opt_in, white_multiplier) |>
  summarize(
    max(ldiveristy_age),
    max(ldiveristy_educd),
    max(ldiveristy_ftotinc),
    max(ldiveristy_inctot),
    max(ldiveristy_incwage)
  )

Attribute Inference Test

holdout_rmse <- prediction_test(
  postsynth = holdout, 
  data = gsds, 
  formula = incwelfr ~ .
)

bind_cols(
  specs,
  incwelfr_rmse = results |>
    map_dbl("incwelfr_rmse")
) |>
  ggplot(aes(opt_in, incwelfr_rmse)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = holdout_rmse, color = "red") +
  labs(
    x = "Opt-In Rate",
    y = "Welfare Income RMSE"
  )

  scatter_grid()
List of 1
 $ panel.grid.major.x:List of 6
  ..$ colour       : chr "#dedddd"
  ..$ linewidth    : NULL
  ..$ linetype     : NULL
  ..$ lineend      : NULL
  ..$ arrow        : logi FALSE
  ..$ inherit.blank: logi FALSE
  ..- attr(*, "class")= chr [1:2] "element_line" "element"
 - attr(*, "class")= chr [1:2] "theme" "gg"
 - attr(*, "complete")= logi FALSE
 - attr(*, "validate")= logi TRUE

Membership Inference Test

bind_cols(
  specs,
  membership_inference_auc = results |>
    map("membership_inference_test") |>
    map_dbl("auc")
) |>
  ggplot(aes(opt_in, membership_inference_auc, color = white_multiplier)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Opt-In Rate",
    y = "Membership Inference Test AUC",
    color = "White Multiplier"
  ) +
  scatter_grid()

results |>
    map("membership_inference_test") |>
    map("conf_mat")
[[1]]
          Truth
Prediction training holdout
  training     4206    4708
  holdout      4708    4206

[[2]]
          Truth
Prediction training holdout
  training     4424    4404
  holdout      4404    4510

[[3]]
          Truth
Prediction training holdout
  training     4670    4244
  holdout      4244    4670

[[4]]
          Truth
Prediction training holdout
  training     4213    4701
  holdout      4701    4213

[[5]]
          Truth
Prediction training holdout
  training     4482    4432
  holdout      4432    4482

[[6]]
          Truth
Prediction training holdout
  training     4747    4167
  holdout      4167    4747

[[7]]
          Truth
Prediction training holdout
  training     4342    4572
  holdout      4572    4342

[[8]]
          Truth
Prediction training holdout
  training     4418    4496
  holdout      4496    4418

[[9]]
          Truth
Prediction training holdout
  training     4702    4212
  holdout      4212    4702

[[10]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[11]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[12]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[13]]
          Truth
Prediction training holdout
  training     4205    4709
  holdout      4709    4205

[[14]]
          Truth
Prediction training holdout
  training     4468    4446
  holdout      4446    4468

[[15]]
          Truth
Prediction training holdout
  training     4698    4216
  holdout      4216    4698

[[16]]
          Truth
Prediction training holdout
  training     4129    4785
  holdout      4785    4129

[[17]]
          Truth
Prediction training holdout
  training     4502    4412
  holdout      4412    4502

[[18]]
          Truth
Prediction training holdout
  training     4720    4194
  holdout      4194    4720

[[19]]
          Truth
Prediction training holdout
  training     4338    4576
  holdout      4576    4338

[[20]]
          Truth
Prediction training holdout
  training     4505    4409
  holdout      4409    4505

[[21]]
          Truth
Prediction training holdout
  training     4702    4212
  holdout      4212    4702

[[22]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[23]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[24]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[25]]
          Truth
Prediction training holdout
  training     4193    4721
  holdout      4721    4193

[[26]]
          Truth
Prediction training holdout
  training     4466    4448
  holdout      4448    4466

[[27]]
          Truth
Prediction training holdout
  training     4708    4157
  holdout      4157    4757

[[28]]
          Truth
Prediction training holdout
  training     4214    4700
  holdout      4700    4214

[[29]]
          Truth
Prediction training holdout
  training     4424    4490
  holdout      4490    4424

[[30]]
          Truth
Prediction training holdout
  training     4726    4188
  holdout      4188    4726

[[31]]
          Truth
Prediction training holdout
  training     4322    4592
  holdout      4592    4322

[[32]]
          Truth
Prediction training holdout
  training     4461    4453
  holdout      4453    4461

[[33]]
          Truth
Prediction training holdout
  training     4673    4241
  holdout      4241    4673

[[34]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[35]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[36]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[37]]
          Truth
Prediction training holdout
  training     4146    4691
  holdout      4691    4223

[[38]]
          Truth
Prediction training holdout
  training     4454    4460
  holdout      4460    4454

[[39]]
          Truth
Prediction training holdout
  training     4704    4210
  holdout      4210    4704

[[40]]
          Truth
Prediction training holdout
  training     4141    4773
  holdout      4773    4141

[[41]]
          Truth
Prediction training holdout
  training     4462    4452
  holdout      4452    4462

[[42]]
          Truth
Prediction training holdout
  training     4690    4224
  holdout      4224    4690

[[43]]
          Truth
Prediction training holdout
  training     4293    4621
  holdout      4621    4293

[[44]]
          Truth
Prediction training holdout
  training     4455    4459
  holdout      4459    4455

[[45]]
          Truth
Prediction training holdout
  training     4721    4193
  holdout      4193    4721

[[46]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[47]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[48]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[49]]
          Truth
Prediction training holdout
  training     4189    4725
  holdout      4725    4189

[[50]]
          Truth
Prediction training holdout
  training     4481    4433
  holdout      4433    4481

[[51]]
          Truth
Prediction training holdout
  training     4700    4214
  holdout      4214    4700

[[52]]
          Truth
Prediction training holdout
  training     4139    4775
  holdout      4775    4139

[[53]]
          Truth
Prediction training holdout
  training     4392    4522
  holdout      4522    4392

[[54]]
          Truth
Prediction training holdout
  training     4722    4192
  holdout      4192    4722

[[55]]
          Truth
Prediction training holdout
  training     4362    4552
  holdout      4552    4362

[[56]]
          Truth
Prediction training holdout
  training     4419    4495
  holdout      4495    4419

[[57]]
          Truth
Prediction training holdout
  training     4691    4223
  holdout      4223    4691

[[58]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[59]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913

[[60]]
          Truth
Prediction training holdout
  training        0       1
  holdout         0    8913